# mTurk maker 
################################################################################
# Dependencies
################################################################################
library(data.table)
library(purrr)
library(lubridate)
library(readr)
library(dplyr)
library(readr)
library(readxl)
library(irr)
library(tidyr)
library(ggplot2)
library(tidyverse)
library(tidycomm)
################################################################################
# Setup
################################################################################
# Start from an empty global environment (kept from the original script;
# note that this also wipes anything the caller had defined interactively).
rm(list = ls())

# - set dir
# Work out where this script lives so that the relative paths used below
# resolve regardless of where R was started from.
args <- commandArgs()

# When the script is run through Rscript, the interpreter arguments contain
# an entry of the form "--file=<path to script>".
scriptName <- args[startsWith(args, "--file=")]

if (length(scriptName) == 0) {
  # No --file= argument: fall back to the file open in the RStudio editor.
  scriptName <- rstudioapi::getSourceEditorContext()$path
} else {
  # Strip the "--file=" prefix to keep only the path.
  scriptName <- sub("^--file=", "", scriptName[1])
}

# Directory part of the script path. dirname() returns "." when the path has
# no directory component (e.g. `Rscript script.R` run from the script's own
# folder); the previous strsplit()-based approach yielded NA in that case and
# made setwd() fail.
pathName <- dirname(scriptName)

# Keep the trailing separator that pathName has always carried.
if (!endsWith(pathName, "/")) {
  pathName <- paste0(pathName, "/")
}

setwd(pathName)

# Read the emoji pattern file (relative to the working directory set above)
# into a single string.
# NOTE(review): the file contents are used verbatim, so a trailing newline in
# the file would become part of the pattern — confirm the file has none.
emo_vec <- readr::read_file(file = "./emoji_regex_vector.txt")

#' Replace every match of a pattern in a character vector
#'
#' Thin wrapper around `stringr::str_replace_all()` with the arguments
#' reordered so that the pattern comes last.
#'
#' @param string Character vector to process.
#' @param replacment Replacement passed on to `str_replace_all()` (the
#'   parameter name is spelled this way; callers may rely on it).
#' @param emo_vec Pattern passed on to `str_replace_all()`.
#' @return The result of `stringr::str_replace_all()`.
emojy_replace <- function(string, replacment, emo_vec) {
  stringr::str_replace_all(
    string = string,
    pattern = emo_vec,
    replacement = replacment
  )
}
################################################################################
# Relevance
################################################################################
# Training data for the relevance task. status_id is read as character
# (presumably so that long tweet IDs are not parsed as numbers — confirm).
mturk_mael <- read_csv(
  "../data/mTurk_data/training_data_tweetscongress_relevance.csv",
  col_types = cols(status_id = col_character())
)

# mTurk results
# The listed columns are forced to character instead of being type-guessed.
df <- read_csv(
  "../data/mTurk_data/batch_results_tweetscongress_relevance_final.csv",
  col_types = cols(
    ApprovalTime = col_character(),
    RejectionTime = col_character(),
    RequesterFeedback = col_character(),
    Input.status_id = col_character(),
    Approve = col_character(),
    Reject = col_character()
  )
)

# Keep only rows that carry an RA code and turn the 1/other coding into text
# labels so it can be compared with Answer.category.label further down.
# The negative label used to be misspelled "Irrelevnt", which can never equal
# a correctly spelled worker answer.
# NOTE(review): assumes the mTurk category labels are exactly "Relevant" and
# "Irrelevant" — confirm with unique(df$Answer.category.label).
df <- df %>%
  filter(!is.na(Input.relevant_ra)) %>%
  mutate(
    Input.relevant_ra = ifelse(Input.relevant_ra == 1, "Relevant", "Irrelevant")
  )

# Quality of mTurk workers
# Restricted to rows flagged with Input.relevance_agreement_check == 1.
# For every worker: share of answers that equal the RA label (quality) and
# the number of rows the share is based on (n_questions).
worker_qual <- df %>%
  filter(Input.relevance_agreement_check == 1) %>%
  group_by(WorkerId) %>%
  summarise(
    quality = sum(as.numeric(Answer.category.label == Input.relevant_ra)) / n(),
    n_questions = n()
  )

# Distribution of the per-worker quality scores.
summary(worker_qual$quality)

# Rows flagged with Input.relevance_agreement_check == 1 that have a
# non-missing worker answer.
batch1_real <- df %>%
  filter(
    Input.relevance_agreement_check == 1,
    !is.na(Answer.category.label)
  )

head(batch1_real$Input.status_id)

# Give the answers of each tweet a pseudo coder id (1, 2, ...) *within* the
# tweet and keep the first two. Previously the ids were assigned by
# alternating 1/2 over the global row order, which only pairs answers
# correctly when the two rows of every tweet sit next to each other;
# otherwise two answers of one tweet could receive the same id and one of
# them was silently dropped.
batch1_real <- batch1_real %>%
  group_by(Input.status_id) %>%
  mutate(pseudo_coder_id = row_number()) %>%
  filter(pseudo_coder_id <= 2L) %>%
  ungroup()

# Agreement over all coders...
batch1_real %>%
  test_icr(
    unit_var = Input.status_id,
    coder_var = pseudo_coder_id,
    Answer.category.label
  )
batch1_real %>% tab_frequencies(Answer.category.label)


# Accuracy over all
# One row per tweet with the two pseudo coders' answers in columns `1`/`2`.
#   coders_agree   - TRUE when both answers are identical
#   relevant_mturk - the agreed label, NA when the coders disagree (this used
#                    to be the string "FALSE" because ifelse() mixed a
#                    character and a logical branch)
#   mturk_vs_ra    - TRUE when the agreed label equals the RA label; FALSE
#                    when the coders disagree or agree on another label; NA
#                    when one of the two answers is missing
batch_accuraacy <- batch1_real %>%
  select(
    Input.status_id,
    Input.relevance_agreement_check,
    Input.relevant_ra,
    Answer.category.label,
    pseudo_coder_id
  ) %>%
  pivot_wider(
    names_from = pseudo_coder_id,
    values_from = Answer.category.label
  ) %>%
  mutate(
    coders_agree = `1` == `2`,
    relevant_mturk = if_else(coders_agree, `1`, NA_character_),
    mturk_vs_ra = coders_agree & relevant_mturk == Input.relevant_ra
  )

# Counts and shares of tweets by whether the mTurk label matches the RA label.
batch_accuraacy %>%
  group_by(mturk_vs_ra) %>%
  summarise(n = n()) %>%
  mutate(f = n / sum(n))

################################################################################
# Frames
################################################################################
# Training data for the frames task; status_id is read as character.
mturk_mael <- read_csv(
  "../data/mTurk_data/training_data_tweetscongress_frames_final.csv",
  col_types = cols(status_id = col_character())
)

# mTurk results
# The listed columns are forced to character instead of being type-guessed.
df <- read_csv(
  "../data/mTurk_data/batch_results_tweetscongress_frames_final.csv",
  col_types = cols(
    ApprovalTime = col_character(),
    RejectionTime = col_character(),
    RequesterFeedback = col_character(),
    Input.status_id = col_character(),
    Approve = col_character(),
    Reject = col_character()
  )
)

# Keep rows that carry an RA frame code, lower-case the worker labels and
# map the label "economy" onto "economic".
df <- df %>%
  filter(!is.na(Input.frames_ra)) %>%
  mutate(
    Answer.category.label = tolower(Answer.category.label),
    Answer.category.label = ifelse(
      Answer.category.label == "economy",
      "economic",
      Answer.category.label
    )
  )

# Quality of mTurk workers
# For every worker: share of answers that equal the RA frame label (quality)
# and the number of rows the share is based on (n_questions).
worker_qual <- df %>%
  group_by(WorkerId) %>%
  summarise(
    quality = sum(as.numeric(Answer.category.label == Input.frames_ra)) / n(),
    n_questions = n()
  )

# Distribution of the per-worker quality scores.
summary(worker_qual$quality)

# Rows with a non-missing worker answer.
batch1_real <- df %>%
  filter(!is.na(Answer.category.label))

head(batch1_real$Input.status_id)

# Give the answers of each tweet a pseudo coder id (1, 2, ...) *within* the
# tweet and keep the first two. Previously the ids were assigned by
# alternating 1/2 over the global row order, which only pairs answers
# correctly when the two rows of every tweet sit next to each other;
# otherwise two answers of one tweet could receive the same id and one of
# them was silently dropped.
batch1_real <- batch1_real %>%
  group_by(Input.status_id) %>%
  mutate(pseudo_coder_id = row_number()) %>%
  filter(pseudo_coder_id <= 2L) %>%
  ungroup()

# Agreement over all coders...
batch1_real %>%
  test_icr(
    unit_var = Input.status_id,
    coder_var = pseudo_coder_id,
    Answer.category.label
  )
batch1_real %>% tab_frequencies(Answer.category.label)


# Accuracy over all
# One row per tweet with the two pseudo coders' answers in columns `1`/`2`.
#   coders_agree   - TRUE when both answers are identical
#   relevant_mturk - the agreed frame label, NA when the coders disagree
#                    (this used to be the string "FALSE" because ifelse()
#                    mixed a character and a logical branch); the column
#                    name is kept from the relevance section
#   mturk_vs_ra    - TRUE when the agreed label equals the RA frame label;
#                    FALSE when the coders disagree or agree on another
#                    label; NA when one of the two answers is missing
batch_accuraacy <- batch1_real %>%
  select(
    Input.status_id,
    Input.frames_ra,
    Answer.category.label,
    pseudo_coder_id
  ) %>%
  pivot_wider(
    names_from = pseudo_coder_id,
    values_from = Answer.category.label
  ) %>%
  mutate(
    coders_agree = `1` == `2`,
    relevant_mturk = if_else(coders_agree, `1`, NA_character_),
    mturk_vs_ra = coders_agree & relevant_mturk == Input.frames_ra
  )

# Counts and shares of tweets by whether the mTurk label matches the RA label.
batch_accuraacy %>%
  group_by(mturk_vs_ra) %>%
  summarise(n = n()) %>%
  mutate(f = n / sum(n))
